# Loading required R packages
library(ggplot2)
library(plotly)
library(shiny)
library(gridExtra)
library(xlsx)
library(MASS)
library(sf)
library(akima)
Sys.setenv('MAPBOX_TOKEN' = 'pk.eyJ1IjoibGFrc2hpZGFhIiwiYSI6ImNqbWIyOHN2NTRlZ3kzam10aTljeGNybWgifQ.8EG9Y6r024e-TGk79f7GpA')
Reading Data
Data Mugging
Quantile Computation
get_outliers <- function(x){
quantile_values = quantile(x, probs = c(0.25, 0.75))
q1 = quantile_values["25%"]
q3 = quantile_values["75%"]
return(c(which((x > (q3+1.5*(q3-q1)))), which(x < (q1-1.5*(q3-q1)))))
}
Scaling the Data
baseball_scaled <- scale(baseball_data[,3:length(baseball_data)])
Distance Matrix between rows
distance_matrix <- dist(baseball_scaled)
Non-metric MDS
mds_result <- isoMDS(distance_matrix, k=2, p=2)
## initial value 19.856833
## iter 5 value 16.319153
## iter 10 value 16.046215
## final value 15.935476
## converged
coords <- mds_result$points
coords_mds <- as.data.frame(coords)
baseball_data_with_mds <- baseball_data
baseball_data_with_mds$MDS_V1 <- coords_mds$V1
baseball_data_with_mds$MDS_V2 <- coords_mds$V2
Single Plots
Density Plot
Density Plot with Outlier Highlight using GGplot2
density_plot_infection_risk = ggplot(senic_data) +
ggtitle("Density plot of Infection_Risk") +
geom_density(aes(x=Infection_Risk), fill = "lightblue") +
geom_point(data=senic_data[get_outliers(senic_data$Infection_Risk),],
aes(x=Infection_Risk, y=0, colour="Outliers"),
shape=23, size=2, fill="red") +
scale_color_manual(values = c("darkblue","black")) +
labs(colour="Legend") +
theme(plot.title = element_text(hjust = 0.5), legend.position = "right")
density_plot_infection_risk

Density Plot with Outlier Highlight using Plotly (converting from ggplot2)
x <- ggplotly(p=density_plot_infection_risk)
x
Histogram Plot
Histogram plot with Outlier Highlight using Plotly
outliers = senic_data[get_outliers(senic_data$Infection_Risk),c("Infection_Risk")]
senic_data$zero = 0
plot_ly(senic_data, x=~Infection_Risk) %>%
add_histogram(name="Histogram count") %>%
filter(is.element(Infection_Risk, outliers)) %>%
add_markers(x=~Infection_Risk,y=~zero, name="Outliers",
marker=list(symbol="diamond", size=10, line = list(color="black", width=1))) %>%
layout(title="Histogram of Infection_Risk", yaxis=list(title="Count"))
Scatter Plot
Simple scatter plot with colour
ggplot(senic_data) + geom_point(aes(x=Number_of_Nurses, y=Infection_Risk, color=Number_of_Beds)) +
ggtitle("Scatterplot of Infection_Risk vs Number_of_Nurses") +
theme(plot.title = element_text(hjust = 0.5))

Scatter Plot with Discreetization (split a variable into classes)
ggplot(olive_data) +
geom_point(aes(x = oleic, y = palmitic,
color=cut_interval(olive_data$linolenic, n = 4))) +
ggtitle("Dependence of Palmitic vs Oleic vs Linolenic") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(color = 'Linolenic range')

Scatter plot size varied
ggplot(olive_data) + geom_point(aes(x = oleic, y = palmitic, size = cut_interval(linolenic, n = 4))) +
ggtitle("Dependence of Palmitic vs Oleic vs Linolenic") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_size_manual(name = "Linolenic range", values = c(1, 2, 3, 4))

Scatter plot angle varied
# Pre-processing - Setting angle values based on category
olive_data$linolenic_class <- cut_interval(olive_data$linolenic, n = 4)
levels(olive_data$linolenic_class) <- (0:3) * (pi/4)
olive_data$linolenic_class <- as.numeric(as.character(olive_data$linolenic_class))
ggplot(olive_data, aes(x=oleic, y=palmitic)) + geom_point() +
geom_spoke(aes(angle = olive_data$linolenic_class), radius=40) +
ggtitle("Dependence of Palmitic vs Oleic vs Linolenic
Legend
Orientation angle of spoke : Linolenic class
0:(0,18.5], 45:(18.5,37], 90:(37,55.5], 135:(0,18.5] ") +
theme(plot.title = element_text(hjust = 0.5))

Scatter plot with color, size and shape
ggplot(olive_data)+
geom_point(aes(x = oleic, y = eicosenoic, color = cut_interval(linoleic, n = 3),
shape = cut_interval(palmitic, n = 3),
size = cut_interval(palmitoleic, n = 3))) +
scale_size_manual(values = c(2,3,4)) +
labs(shape = "Palmitic range", color = "Linoleic range", size = "Palmitoleic range") +
ggtitle("Dependence of Oleic vs Eicosenoic vs Linoleic vs Palmitic vs Palmitoleic") +
theme(plot.title = element_text(hjust = 0.5))

Scatter Plot of MDS
baseball_scaled <- scale(baseball_data[,3:length(baseball_data)])
### Distance Matrix between rows
distance_matrix <- dist(baseball_scaled)
### Non-metric MDS
mds_result <- isoMDS(distance_matrix, k=2, p=2)
## initial value 19.856833
## iter 5 value 16.319153
## iter 10 value 16.046215
## final value 15.935476
## converged
coords <- mds_result$points
coords_mds <- as.data.frame(coords)
baseball_data_with_mds <- baseball_data
baseball_data_with_mds$MDS_V1 <- coords_mds$V1
baseball_data_with_mds$MDS_V2 <- coords_mds$V2
plot_ly(baseball_data_with_mds, x=~MDS_V1, y=~MDS_V2) %>%
add_markers(color=~League, colors = c("blue", "red"),
text=baseball_data_with_mds$Team, hoverinfo="text") %>%
layout(title="Scatterplot of the 2 MDS variables")
Shepard Plot
Shepard plot shows the fit of MDS, the distance in original dataset vs. distance in reordered dataset should be similar
baseball_scaled <- scale(baseball_data[,3:length(baseball_data)])
### Distance Matrix between rows
distance_matrix <- dist(baseball_scaled)
mds_result <- isoMDS(distance_matrix, k=2, p=2)
## initial value 19.856833
## iter 5 value 16.319153
## iter 10 value 16.046215
## final value 15.935476
## converged
coords <- mds_result$points
shp <- Shepard(distance_matrix, coords)
delta <- as.numeric(distance_matrix)
D <- as.numeric(dist(coords))
n <- nrow(coords)
index <- matrix(1:n, nrow=n, ncol=n)
index1 <- as.numeric(index[lower.tri(index)])
n <- nrow(coords)
index <- matrix(1:n, nrow=n, ncol=n, byrow = T)
index2 <- as.numeric(index[lower.tri(index)])
plot_ly()%>%
add_markers(x=~delta, y=~D, name="Observation pairs", hoverinfo = 'text',
text = ~paste('Obj 1: ',
rownames(baseball_data_with_mds)[index1],
'<br> Obj 2: ',
rownames(baseball_data_with_mds)[index2])) %>%
add_lines(x=~shp$x, y=~shp$yf, name="Isotonic regression line") %>%
layout(title="Shepard's plot of MDS operation")
Pie Charts
Simple pie chart
plot_ly(olive_data,labels=~Area,type='pie',textinfo = "none") %>%
layout(title = "Pie chart of proportion of oils coming from different areas")
2D density contour plot
Simple 2D plot using ggplot2
ggplot(olive_data)+geom_density_2d(aes(x=eicosenoic, y=linoleic)) +
ggtitle("Contour plot of Linoleic vs Eicosenoic") +
theme(plot.title = element_text(hjust = 0.5))

Dot Map Plots (World Map)
Dot Map (Map with scatter plots)
# 2004
plot_mapbox(aegypti_data[aegypti_data$YEAR == 2004,], lat = ~Y, lon = ~X) %>%
add_markers(color = ~VECTOR, hoverinfo = "text",
text = ~paste(COUNTRY), alpha = 0.7) %>%
layout(title = "Dot map of mosquito distribution in the world (2004)")
Choropleth Map
Chorpleth Map with Equirectangular Projection
# Data aggregation
country_aggregate = aggregate(aegypti_data[,c("COUNTRY", "COUNTRY_ID")],
by = list(aegypti_data$COUNTRY, aegypti_data$COUNTRY_ID), FUN=length)
country_aggregate$COUNTRY = NULL
colnames(country_aggregate) = c("COUNTRY", "COUNTRY_ID", "Count")
plot_geo(country_aggregate) %>% add_trace(
z = ~Count,
text = ~COUNTRY, locations = ~COUNTRY_ID) %>%
layout(title = "Choropleth plot of number of mosquitoes",
geo = list(projection = list(type = "equirectangular")))
Choropleth plot with Equirectangular Projection and log
plot_geo(country_aggregate) %>%
add_trace(
z = ~log(Count) ,
text = ~paste(COUNTRY, "\n Count: ", Count), locations = ~COUNTRY_ID,
hoverinfo = "text"
) %>%
layout(
title = "Choropleth plot of number of mosquitoes",
geo = list(projection = list(type = "equirectangular")))
Choropleth plot with Conic Area Projection and log
plot_geo(country_aggregate) %>%
add_trace(
z = ~log(Count) ,
text = ~paste(COUNTRY, "\n Count: ", Count), locations = ~COUNTRY_ID
) %>%
layout(
title = "Choropleth plot of number of mosquitoes",
geo = list(projection = list(type = "conic equal area")))
Multiple Plots
Density Plots
Density Plot with Outlier Highlight
plot_density_with_outliers <- function(var_data, col_name){
p <- NULL
df_data = setNames(data.frame(var_data),col_name)
if(length(get_outliers(df_data[[col_name]])) > 0){
p <- ggplot(df_data) +
geom_density(aes_string(x=col_name), fill = "lightblue", color = "darkblue") +
geom_point(data=df_data[get_outliers(df_data[[col_name]]),,drop=FALSE],
aes_string(x=col_name), y=0, shape=23, size=2, colour="black", fill="red")
}
else{
p <- ggplot(df_data) +
ggtitle(paste("")) +
theme(plot.title = element_text(hjust = 0.5)) +
geom_density(aes_string(x=col_name), fill = "lightblue", color = "darkblue")
}
return(p)
}
categorical_columns = c("Medical_School_Affiliation", "Region")
ID_columns = c("Identification_Number")
quantitative_columns = setdiff(colnames(senic_data), c(categorical_columns, ID_columns))
plot_list = mapply(plot_density_with_outliers, senic_data[, quantitative_columns],
colnames(senic_data[, quantitative_columns]), SIMPLIFY = FALSE)
plot_matrix <- arrangeGrob(grobs = plot_list, ncol = 2)
grid.arrange(plot_matrix, respect=TRUE, top="Density plots of SENIC data variables")

Shiny
#UI component
ui <- fluidPage(
sliderInput(inputId="bw_value", label="Choose bandwidth size", value=4.5, min=0.1, max=80),
checkboxGroupInput("selected_variables", "Variables to show: ", quantitative_columns, inline=TRUE),
plotOutput("densPlot", height = "650px")
)
plot_density_with_outliers_shiny <- function(df_data, col_name, bw){
p <- NULL
if(length(get_outliers(senic_data[[col_name]])) > 0){
p <- ggplot(df_data) +
ggtitle(paste("Density plot of ", col_name)) +
theme(plot.title = element_text(hjust = 0.5)) +
geom_density(aes_string(x=col_name), fill = "lightblue", color = "darkblue", bw=bw) +
geom_point(data=df_data[get_outliers(df_data[[col_name]]),],
aes_string(x=col_name, y=0), shape=23, size=2, colour="black", fill="red")
}
else{
p <- ggplot(df_data) +
ggtitle(paste("Density plot of ", col_name)) +
theme(plot.title = element_text(hjust = 0.5)) +
geom_density(aes_string(x=col_name), fill = "lightblue", color = "darkblue", bw=bw)
}
return(p)
}
server <- function(input, output) {
output$densPlot <- renderPlot({
selected_columns = input$selected_variables
plot_list = vector("list", length(selected_columns))
if(length(selected_columns) > 0){
for(i in 1:length(selected_columns)){
plot_list[[i]] = plot_density_with_outliers_shiny(senic_data, selected_columns[i],
bw = input$bw_value)
}
plot_matrix <- arrangeGrob(grobs = plot_list, ncol = 2)
grid.arrange(plot_matrix)
}
})
}
shinyApp(ui = ui, server = server, options = list(width="800px", height="900px"))